This tutorial is created by Juhee Bae to instruct "Explainable AI". Refer to the lecture and the code provided on Canvas. Do not share with external venues. Refer to this tutorial to solve the assignment.
import pandas as pd
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
import seaborn as sb
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
import lime
from lime.lime_tabular import LimeTabularExplainer
import time
# Load the daily bike-sharing data; parse 'dteday' as datetimes and use that
# column (position 1) as the DataFrame index.
df = pd.read_csv('../data/day-bikesharing.csv',parse_dates=['dteday'],index_col=1) # Load the data
df.shape
(731, 15)
def add_features(df):
    """Add calendar columns derived from df's DatetimeIndex, in place.

    Adds 'year', 'month', 'day' and 'dayofweek'. Note the raw data already
    carries 'yr' and 'mnth'; these are convenience duplicates taken from
    the index so they use the standard pandas encodings.
    """
    idx = df.index
    for attr in ('year', 'month', 'day', 'dayofweek'):
        df[attr] = getattr(idx, attr)
Note that 2011-01-01 is a Saturday, which is why `dayofweek` starts at 5 in the preview below.
# Derive the calendar features on the working DataFrame and preview rows.
add_features(df)
df.head(3)
| instant | season | yr | mnth | holiday | weekday | workingday | weathersit | temp | atemp | hum | windspeed | casual | registered | cnt | year | month | day | dayofweek | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| dteday | |||||||||||||||||||
| 2011-01-01 | 1 | 1 | 0 | 1 | 0 | 6 | 0 | 2 | 0.344167 | 0.363625 | 0.805833 | 0.160446 | 331 | 654 | 985 | 2011 | 1 | 1 | 5 |
| 2011-01-02 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 2 | 0.363478 | 0.353739 | 0.696087 | 0.248539 | 131 | 670 | 801 | 2011 | 1 | 2 | 6 |
| 2011-01-03 | 3 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 0.196364 | 0.189405 | 0.437273 | 0.248309 | 120 | 1229 | 1349 | 2011 | 1 | 3 | 0 |
# Let's use 'dayofweek' instead of 'weekday'.
# Compare the raw 'weekday' encoding (first 7 days) against the
# index-derived 'dayofweek' (first 14 days).
plt.figure(figsize=(16,3))
df['weekday'][:7].plot(linestyle='--', marker='o', color='k', ms=20,
                       title='weekday feature (confusing with \'6\' and \'0\' for weekends)')
plt.show()
plt.figure(figsize=(16,3))
df['dayofweek'][:14].plot(linestyle='--', marker='o', color='r', ms=20,
                          title='dayofweek feature')
plt.show()
# Day feature shows whether it is a start of the month or the end of the month (1 to 30 or 31).
plt.figure(figsize=(16,3))
df['day'][:33].plot(linestyle='--', marker='o', color='c', ms=20, title='day feature')
plt.show()
# The workingday flag over the first week (0 = weekend/holiday, 1 = working day)
plt.figure(figsize=(16,3))
df['workingday'][:7].plot(linestyle='--', marker='o', color='b', ms=20, title='workingday feature')
plt.show()
# Season code (1-4) over the two years, then how many days fall in each season
plt.figure(figsize=(16,5))
df['season'].plot()
plt.show()
df.season.value_counts()
3 188 2 184 1 181 4 178 Name: season, dtype: int64
# Daily rental count over the full time range
plt.figure(figsize=(16,5))
df['cnt'].plot()
plt.show()
# Rental count against temperature in degrees Celsius
plt.scatter(x=df.temp*41,y=df["cnt"]) # denormalize temperature by multiplying 41
plt.grid(True)
plt.xlabel('Temperature')
plt.ylabel('Rental Count')
plt.title('Temperature vs. Rental Count')
plt.show()
# From January to April, the demand is relatively low due to Winter
# (there are not a lot of people who are brave enough to bike on icy roads, are there?).
# The peak demand happens around Summer time (from May to October) and then drops again.
# Average rental count per calendar month (both years pooled)
group_month = df.groupby(['month'])
average_by_month = group_month['cnt'].mean()
plt.plot(average_by_month.index,average_by_month)
plt.xlabel('Month')
plt.ylabel('Count')
plt.xticks(np.arange(13))
plt.grid(True)
plt.title('Average Rental Count by Month')
Text(0.5, 1.0, 'Average Rental Count by Month')
#correlation of total cnt with other variables
plt.figure(figsize=(16,5))
df.corr()['cnt'].plot(kind='bar')
plt.show()
# Summary statistics of every column (transposed for readability)
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| instant | 731.0 | 366.000000 | 211.165812 | 1.000000 | 183.500000 | 366.000000 | 548.500000 | 731.000000 |
| season | 731.0 | 2.496580 | 1.110807 | 1.000000 | 2.000000 | 3.000000 | 3.000000 | 4.000000 |
| yr | 731.0 | 0.500684 | 0.500342 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| mnth | 731.0 | 6.519836 | 3.451913 | 1.000000 | 4.000000 | 7.000000 | 10.000000 | 12.000000 |
| holiday | 731.0 | 0.028728 | 0.167155 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| weekday | 731.0 | 2.997264 | 2.004787 | 0.000000 | 1.000000 | 3.000000 | 5.000000 | 6.000000 |
| workingday | 731.0 | 0.683995 | 0.465233 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| weathersit | 731.0 | 1.395349 | 0.544894 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 3.000000 |
| temp | 731.0 | 0.495385 | 0.183051 | 0.059130 | 0.337083 | 0.498333 | 0.655417 | 0.861667 |
| atemp | 731.0 | 0.474354 | 0.162961 | 0.079070 | 0.337842 | 0.486733 | 0.608602 | 0.840896 |
| hum | 731.0 | 0.627894 | 0.142429 | 0.000000 | 0.520000 | 0.626667 | 0.730209 | 0.972500 |
| windspeed | 731.0 | 0.190486 | 0.077498 | 0.022392 | 0.134950 | 0.180975 | 0.233214 | 0.507463 |
| casual | 731.0 | 848.176471 | 686.622488 | 2.000000 | 315.500000 | 713.000000 | 1096.000000 | 3410.000000 |
| registered | 731.0 | 3656.172367 | 1560.256377 | 20.000000 | 2497.000000 | 3662.000000 | 4776.500000 | 6946.000000 |
| cnt | 731.0 | 4504.348837 | 1937.211452 | 22.000000 | 3152.000000 | 4548.000000 | 5956.000000 | 8714.000000 |
| year | 731.0 | 2011.500684 | 0.500342 | 2011.000000 | 2011.000000 | 2012.000000 | 2012.000000 | 2012.000000 |
| month | 731.0 | 6.519836 | 3.451913 | 1.000000 | 4.000000 | 7.000000 | 10.000000 | 12.000000 |
| day | 731.0 | 15.738714 | 8.809949 | 1.000000 | 8.000000 | 16.000000 | 23.000000 | 31.000000 |
| dayofweek | 731.0 | 3.002736 | 2.004787 | 0.000000 | 1.000000 | 3.000000 | 5.000000 | 6.000000 |
# Review dtypes: all columns are already numeric (int64/float64)
df.dtypes
instant int64 season int64 yr int64 mnth int64 holiday int64 weekday int64 workingday int64 weathersit int64 temp float64 atemp float64 hum float64 windspeed float64 casual int64 registered int64 cnt int64 year int64 month int64 day int64 dayofweek int64 dtype: object
Perform one-hot encoding on the categorical variables. Machine learning algorithms assume (and require) the data to be numeric, so categorical data must be pre-processed into numerical form. Each categorical variable turns into a set of binary features that are 'one hot' encoded: if a row belongs to the category represented by a column, that column receives a 1, otherwise a 0.
# Define the categorical variables to one-hot encode
cat_var = ['season', 'holiday', 'workingday', 'weathersit', 'year']
# OneHotEncoder: one binary indicator column per category level
ohe = OneHotEncoder(categories = 'auto')
# Fit the categorical variables to the encoder and transform them
encodeddf = ohe.fit_transform(df[cat_var])
# Create a DataFrame with the encoded value information.
# get_feature_names() was deprecated in sklearn 1.0 and removed in 1.2;
# get_feature_names_out() produces the same 'col_level' names without
# the FutureWarning.
cat_df = pd.DataFrame(data = encodeddf.toarray(), columns = ohe.get_feature_names_out(cat_var))
cat_df
C:\Python310\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead. warnings.warn(msg, category=FutureWarning)
| season_1 | season_2 | season_3 | season_4 | holiday_0 | holiday_1 | workingday_0 | workingday_1 | weathersit_1 | weathersit_2 | weathersit_3 | year_2011 | year_2012 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 2 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 726 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 727 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 728 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 729 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 730 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
731 rows × 13 columns
# Give the one-hot columns human-readable names: season codes 1-4 become
# winter/spring/summer/fall, weathersit codes 1-3 become weather labels.
cat_df.rename(columns={'season_1': 'winter', 'season_2': 'spring',
                       'season_3': 'summer', 'season_4': 'fall',
                       'weathersit_1': 'clear', 'weathersit_2': 'cloudy',
                       'weathersit_3': 'lightsnow.rain'},
              inplace=True)
cat_df
| winter | spring | summer | fall | holiday_0 | holiday_1 | workingday_0 | workingday_1 | clear | cloudy | lightsnow.rain | year_2011 | year_2012 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 2 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 726 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 727 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 728 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 729 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 730 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
731 rows × 13 columns
Machine learning is sensitive to the relative scales of numerical variables. The range of all variables need to be normalized so that each feature contributes approximately proportionately to the calculation.
# Define the numerical variables to standardize
num_var = ['temp', 'atemp', 'hum', 'windspeed', 'month', 'day', 'dayofweek']
Create standardized (z-score normalized) versions of the numerical features.
# StandardScaler object (z-score: subtract mean, divide by std)
scaler = StandardScaler()
# Fit the data to the scaler and transform the numerical columns
numscaled = scaler.fit_transform(df[num_var])
Keep the original (inverse-transformed) values for later viewing.
# for inverse transformation: recover the pre-scaling values so we can
# display samples in their original units later
inversed = scaler.inverse_transform(numscaled)
print(inversed)
[[ 0.344167 0.363625 0.805833 ... 1. 1. 5. ] [ 0.363478 0.353739 0.696087 ... 1. 2. 6. ] [ 0.196364 0.189405 0.437273 ... 1. 3. 0. ] ... [ 0.253333 0.2424 0.752917 ... 12. 29. 5. ] [ 0.255833 0.2317 0.483333 ... 12. 30. 6. ] [ 0.215833 0.223487 0.5775 ... 12. 31. 0. ]]
# Create DataFrame with original (unscaled) data
inversed_df = pd.DataFrame(data = inversed, columns = num_var)
inversed_df
# Calculate based on UCI info to retrieve the actual temperature and weather values:
#   temp      : normalized temperature in Celsius, divided by 41 (max)
#   atemp     : normalized feeling temperature in Celsius, divided by 50 (max)
#   hum       : normalized humidity, divided by 100 (max)
#   windspeed : normalized wind speed, divided by 67 (max)
denorm_factors = {'temp': 41, 'atemp': 50, 'hum': 100, 'windspeed': 67}
inversed_origin = pd.concat(
    [inversed_df[col] * factor for col, factor in denorm_factors.items()],
    axis=1)
inversed_origin
| temp | atemp | hum | windspeed | |
|---|---|---|---|---|
| 0 | 14.110847 | 18.18125 | 80.5833 | 10.749882 |
| 1 | 14.902598 | 17.68695 | 69.6087 | 16.652113 |
| 2 | 8.050924 | 9.47025 | 43.7273 | 16.636703 |
| 3 | 8.200000 | 10.60610 | 59.0435 | 10.739832 |
| 4 | 9.305237 | 11.46350 | 43.6957 | 12.522300 |
| ... | ... | ... | ... | ... |
| 726 | 10.420847 | 11.33210 | 65.2917 | 23.458911 |
| 727 | 10.386653 | 12.75230 | 59.0000 | 10.416557 |
| 728 | 10.386653 | 12.12000 | 75.2917 | 8.333661 |
| 729 | 10.489153 | 11.58500 | 48.3333 | 23.500518 |
| 730 | 8.849153 | 11.17435 | 57.7500 | 10.374682 |
731 rows × 4 columns
# Units are different but still can review the distribution of each feature
inversed_origin.boxplot()
<AxesSubplot:>
# Time series of the four denormalized features, one line each
ax = inversed_origin.set_index(inversed_origin.index).plot(figsize=(20,5))
ax.set_xlabel('daily time (# rows)')
Text(0.5, 0, 'daily time (# rows)')
# Human-readable feature matrix: one-hot categoricals + real-unit numericals
X_original = pd.concat([cat_df, inversed_origin, inversed_df[['month', 'day', 'dayofweek']]], axis=1)
X_original # Use this for SHAP plot view
| winter | spring | summer | fall | holiday_0 | holiday_1 | workingday_0 | workingday_1 | clear | cloudy | lightsnow.rain | year_2011 | year_2012 | temp | atemp | hum | windspeed | month | day | dayofweek | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 14.110847 | 18.18125 | 80.5833 | 10.749882 | 1.0 | 1.0 | 5.0 |
| 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 14.902598 | 17.68695 | 69.6087 | 16.652113 | 1.0 | 2.0 | 6.0 |
| 2 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 8.050924 | 9.47025 | 43.7273 | 16.636703 | 1.0 | 3.0 | 0.0 |
| 3 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 8.200000 | 10.60610 | 59.0435 | 10.739832 | 1.0 | 4.0 | 1.0 |
| 4 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 9.305237 | 11.46350 | 43.6957 | 12.522300 | 1.0 | 5.0 | 2.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 726 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 10.420847 | 11.33210 | 65.2917 | 23.458911 | 12.0 | 27.0 | 3.0 |
| 727 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 10.386653 | 12.75230 | 59.0000 | 10.416557 | 12.0 | 28.0 | 4.0 |
| 728 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 10.386653 | 12.12000 | 75.2917 | 8.333661 | 12.0 | 29.0 | 5.0 |
| 729 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 10.489153 | 11.58500 | 48.3333 | 23.500518 | 12.0 | 30.0 | 6.0 |
| 730 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 8.849153 | 11.17435 | 57.7500 | 10.374682 | 12.0 | 31.0 | 0.0 |
731 rows × 20 columns
# Create DataFrame with numerically scaled (standardized) data
num_df = pd.DataFrame(data = numscaled, columns = num_var)
num_df
| temp | atemp | hum | windspeed | month | day | dayofweek | |
|---|---|---|---|---|---|---|---|
| 0 | -0.826662 | -0.679946 | 1.250171 | -0.387892 | -1.600161 | -1.674108 | 0.996930 |
| 1 | -0.721095 | -0.740652 | 0.479113 | 0.749602 | -1.600161 | -1.560522 | 1.496077 |
| 2 | -1.634657 | -1.749767 | -1.339274 | 0.746632 | -1.600161 | -1.446936 | -1.498809 |
| 3 | -1.614780 | -1.610270 | -0.263182 | -0.389829 | -1.600161 | -1.333351 | -0.999661 |
| 4 | -1.467414 | -1.504971 | -1.341494 | -0.046307 | -1.600161 | -1.219765 | -0.500513 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 726 | -1.318665 | -1.521108 | 0.175807 | 2.061426 | 1.588660 | 1.279122 | -0.001366 |
| 727 | -1.323224 | -1.346690 | -0.266238 | -0.452131 | 1.588660 | 1.392707 | 0.497782 |
| 728 | -1.323224 | -1.424344 | 0.878392 | -0.853552 | 1.588660 | 1.506293 | 0.996930 |
| 729 | -1.309558 | -1.490049 | -1.015664 | 2.069444 | 1.588660 | 1.619879 | 1.496077 |
| 730 | -1.528225 | -1.540482 | -0.354061 | -0.460201 | 1.588660 | 1.733465 | -1.498809 |
731 rows × 7 columns
Define X (input): Concatenate one hot encoded categorical + normalized numerical data
# Xdf: normalized model input = one-hot categoricals + standardized numericals
Xdf = pd.concat([cat_df, num_df], axis = 1)
Xdf.head(5)
| winter | spring | summer | fall | holiday_0 | holiday_1 | workingday_0 | workingday_1 | clear | cloudy | lightsnow.rain | year_2011 | year_2012 | temp | atemp | hum | windspeed | month | day | dayofweek | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | -0.826662 | -0.679946 | 1.250171 | -0.387892 | -1.600161 | -1.674108 | 0.996930 |
| 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | -0.721095 | -0.740652 | 0.479113 | 0.749602 | -1.600161 | -1.560522 | 1.496077 |
| 2 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | -1.634657 | -1.749767 | -1.339274 | 0.746632 | -1.600161 | -1.446936 | -1.498809 |
| 3 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | -1.614780 | -1.610270 | -0.263182 | -0.389829 | -1.600161 | -1.333351 | -0.999661 |
| 4 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | -1.467414 | -1.504971 | -1.341494 | -0.046307 | -1.600161 | -1.219765 | -0.500513 |
# Save column names for SHAP usage (also reused for feature-importance labels)
NAMES = Xdf.columns
NAMES
Index(['winter', 'spring', 'summer', 'fall', 'holiday_0', 'holiday_1',
'workingday_0', 'workingday_1', 'clear', 'cloudy', 'lightsnow.rain',
'year_2011', 'year_2012', 'temp', 'atemp', 'hum', 'windspeed', 'month',
'day', 'dayofweek'],
dtype='object')
# Pairwise correlation heatmap of the model inputs
plt.subplots(figsize=(8, 6))
sb.heatmap(Xdf.corr(), cmap='coolwarm')
<AxesSubplot:>
# Check if there is any non-available (NaN) value; expect 0
Xdf.isna().any().sum()
0
Define Y (to predict): Extract target
# Here, indicate which target to predict (cnt or casual or registered)
to_predict = 'cnt'
# Extract the target series from the raw data
y = df[to_predict]
# Normalize target data: Calculate mean and standard deviation (z-score)
y_mu = y.mean(0)
y_sd = y.std(0)
print('y mean:', y_mu, ' y std:' , y_sd)
y_norm = (y - y_mu) / y_sd
y_norm = y_norm.values.ravel() #dataframe to 1D array
#y_norm
y mean: 4504.3488372093025 y std: 1937.2114516187676
Split training and test data
np.random.seed(1234)
# Split the data into training, validation, and test data:
# first carve off 20% as the held-out test set...
X_train, X_test, y_train, y_test = train_test_split(Xdf, y_norm, test_size = 0.2, random_state=1)
# ...then 20% of the remaining training data as the validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
# A parameter grid for XGBoost (3*5*3*3*3 = 405 candidate combinations)
params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5]
}
# Build the model with
# XGBoost Training Parameter Reference:
# https://github.com/dmlc/xgboost
# https://xgboost.readthedocs.io/en/latest/parameter.html
# XGBRegressor object and GridSearch object (3-fold cross-validation)
xgbmodel = xgb.XGBRegressor()
clf = GridSearchCV(estimator = xgbmodel,
                   param_grid = params,
                   cv = 3,
                   verbose=1)
# Fit the training data to the GridSearch object, timing the search
t0 = time.time()
clf.fit(X_train, y_train)
t1 = time.time()
print(t1-t0, " sec taken")
Fitting 3 folds for each of 405 candidates, totalling 1215 fits 42.089298486709595 sec taken
# Display the best parameters and best CV score from the GridSearch
print(clf.best_params_)
print(clf.best_score_)
{'colsample_bytree': 0.8, 'gamma': 0.5, 'max_depth': 4, 'min_child_weight': 5, 'subsample': 0.8}
0.8633448414950647
# Print all estimator parameters (not only changed ones) in model reprs
from sklearn._config import get_config, set_config
set_config(print_changed_only=False)
# Rebuild the regressor with the best grid-search parameters
xgbmodel = xgb.XGBRegressor(**clf.best_params_, random_state=0)
# Fit the training data to the model, tracking RMSE on train and validation sets
xgbmodel.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_val, y_val)])
[0] validation_0-rmse:0.84811 validation_1-rmse:0.86342 [1] validation_0-rmse:0.67246 validation_1-rmse:0.69370 [2] validation_0-rmse:0.54534 validation_1-rmse:0.58267 [3] validation_0-rmse:0.45971 validation_1-rmse:0.50922 [4] validation_0-rmse:0.40182 validation_1-rmse:0.45342 [5] validation_0-rmse:0.36729 validation_1-rmse:0.40947 [6] validation_0-rmse:0.33601 validation_1-rmse:0.38506 [7] validation_0-rmse:0.31756 validation_1-rmse:0.37200 [8] validation_0-rmse:0.30014 validation_1-rmse:0.36170 [9] validation_0-rmse:0.28967 validation_1-rmse:0.35561 [10] validation_0-rmse:0.28115 validation_1-rmse:0.34970 [11] validation_0-rmse:0.27210 validation_1-rmse:0.34349 [12] validation_0-rmse:0.27027 validation_1-rmse:0.33654 [13] validation_0-rmse:0.26416 validation_1-rmse:0.33288 [14] validation_0-rmse:0.25919 validation_1-rmse:0.33080 [15] validation_0-rmse:0.25919 validation_1-rmse:0.33081 [16] validation_0-rmse:0.25706 validation_1-rmse:0.32948 [17] validation_0-rmse:0.25709 validation_1-rmse:0.32944 [18] validation_0-rmse:0.25387 validation_1-rmse:0.32742 [19] validation_0-rmse:0.25387 validation_1-rmse:0.32742 [20] validation_0-rmse:0.24894 validation_1-rmse:0.32367 [21] validation_0-rmse:0.24895 validation_1-rmse:0.32364 [22] validation_0-rmse:0.24892 validation_1-rmse:0.32373 [23] validation_0-rmse:0.24595 validation_1-rmse:0.32242 [24] validation_0-rmse:0.24157 validation_1-rmse:0.32198 [25] validation_0-rmse:0.24158 validation_1-rmse:0.32204 [26] validation_0-rmse:0.24157 validation_1-rmse:0.32200 [27] validation_0-rmse:0.24158 validation_1-rmse:0.32201 [28] validation_0-rmse:0.24160 validation_1-rmse:0.32208 [29] validation_0-rmse:0.23884 validation_1-rmse:0.31778 [30] validation_0-rmse:0.23884 validation_1-rmse:0.31774 [31] validation_0-rmse:0.23884 validation_1-rmse:0.31773 [32] validation_0-rmse:0.23885 validation_1-rmse:0.31768 [33] validation_0-rmse:0.23886 validation_1-rmse:0.31765 [34] validation_0-rmse:0.23274 validation_1-rmse:0.31404 [35] 
validation_0-rmse:0.23275 validation_1-rmse:0.31403 [36] validation_0-rmse:0.23028 validation_1-rmse:0.30987 [37] validation_0-rmse:0.23027 validation_1-rmse:0.30990 [38] validation_0-rmse:0.23027 validation_1-rmse:0.30989 [39] validation_0-rmse:0.23028 validation_1-rmse:0.30993 [40] validation_0-rmse:0.23028 validation_1-rmse:0.30986 [41] validation_0-rmse:0.23028 validation_1-rmse:0.30986 [42] validation_0-rmse:0.23028 validation_1-rmse:0.30987 [43] validation_0-rmse:0.23028 validation_1-rmse:0.30985 [44] validation_0-rmse:0.23028 validation_1-rmse:0.30986 [45] validation_0-rmse:0.23029 validation_1-rmse:0.30985 [46] validation_0-rmse:0.23029 validation_1-rmse:0.30985 [47] validation_0-rmse:0.23030 validation_1-rmse:0.30983 [48] validation_0-rmse:0.23028 validation_1-rmse:0.30988 [49] validation_0-rmse:0.23028 validation_1-rmse:0.30987 [50] validation_0-rmse:0.23029 validation_1-rmse:0.30985 [51] validation_0-rmse:0.23028 validation_1-rmse:0.30988 [52] validation_0-rmse:0.23029 validation_1-rmse:0.30997 [53] validation_0-rmse:0.23028 validation_1-rmse:0.30988 [54] validation_0-rmse:0.23028 validation_1-rmse:0.30992 [55] validation_0-rmse:0.23028 validation_1-rmse:0.30988 [56] validation_0-rmse:0.23027 validation_1-rmse:0.30990 [57] validation_0-rmse:0.23027 validation_1-rmse:0.30990 [58] validation_0-rmse:0.23028 validation_1-rmse:0.30988 [59] validation_0-rmse:0.23027 validation_1-rmse:0.30991 [60] validation_0-rmse:0.23028 validation_1-rmse:0.30992 [61] validation_0-rmse:0.22742 validation_1-rmse:0.30678 [62] validation_0-rmse:0.22739 validation_1-rmse:0.30670 [63] validation_0-rmse:0.22738 validation_1-rmse:0.30665 [64] validation_0-rmse:0.22738 validation_1-rmse:0.30668 [65] validation_0-rmse:0.22738 validation_1-rmse:0.30669 [66] validation_0-rmse:0.22738 validation_1-rmse:0.30663 [67] validation_0-rmse:0.22738 validation_1-rmse:0.30662 [68] validation_0-rmse:0.22737 validation_1-rmse:0.30663 [69] validation_0-rmse:0.22532 validation_1-rmse:0.30405 [70] 
validation_0-rmse:0.22532 validation_1-rmse:0.30406 [71] validation_0-rmse:0.22532 validation_1-rmse:0.30405 [72] validation_0-rmse:0.22533 validation_1-rmse:0.30408 [73] validation_0-rmse:0.22533 validation_1-rmse:0.30407 [74] validation_0-rmse:0.22532 validation_1-rmse:0.30404 [75] validation_0-rmse:0.22533 validation_1-rmse:0.30404 [76] validation_0-rmse:0.22532 validation_1-rmse:0.30405 [77] validation_0-rmse:0.22532 validation_1-rmse:0.30405 [78] validation_0-rmse:0.22532 validation_1-rmse:0.30405 [79] validation_0-rmse:0.22372 validation_1-rmse:0.30350 [80] validation_0-rmse:0.22370 validation_1-rmse:0.30357 [81] validation_0-rmse:0.22370 validation_1-rmse:0.30358 [82] validation_0-rmse:0.22371 validation_1-rmse:0.30361 [83] validation_0-rmse:0.22372 validation_1-rmse:0.30350 [84] validation_0-rmse:0.22371 validation_1-rmse:0.30350 [85] validation_0-rmse:0.22371 validation_1-rmse:0.30350 [86] validation_0-rmse:0.22370 validation_1-rmse:0.30351 [87] validation_0-rmse:0.22369 validation_1-rmse:0.30355 [88] validation_0-rmse:0.22370 validation_1-rmse:0.30352 [89] validation_0-rmse:0.22370 validation_1-rmse:0.30352 [90] validation_0-rmse:0.22373 validation_1-rmse:0.30349 [91] validation_0-rmse:0.22369 validation_1-rmse:0.30354 [92] validation_0-rmse:0.22370 validation_1-rmse:0.30357 [93] validation_0-rmse:0.22369 validation_1-rmse:0.30355 [94] validation_0-rmse:0.22369 validation_1-rmse:0.30356 [95] validation_0-rmse:0.22369 validation_1-rmse:0.30355 [96] validation_0-rmse:0.22169 validation_1-rmse:0.30439 [97] validation_0-rmse:0.22168 validation_1-rmse:0.30437 [98] validation_0-rmse:0.22168 validation_1-rmse:0.30431 [99] validation_0-rmse:0.22167 validation_1-rmse:0.30431
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.8,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=0.5, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=5,
missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
num_parallel_tree=1, objective='reg:squarederror',
predictor='auto', random_state=0, reg_alpha=0, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.8,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=0.5, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=5,
missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
num_parallel_tree=1, objective='reg:squarederror',
predictor='auto', random_state=0, reg_alpha=0, ...)eval_result = xgbmodel.evals_result()
# One entry per boosting round; 'validation_0' is the training split
training_rounds = range(len(eval_result['validation_0']['rmse'])) #training
print(training_rounds)
range(0, 100)
# Learning curves: RMSE per boosting round for both eval_set entries.
# eval_set was [(train), (validation)], so 'validation_1' is the
# validation split -- the previous labels called it 'Test', which was
# misleading (the test set is held out until later).
plt.figure(figsize=(8,4))
plt.scatter(x=training_rounds,y=eval_result['validation_0']['rmse'],label='Training Error')
plt.scatter(x=training_rounds,y=eval_result['validation_1']['rmse'],label='Validation Error')
plt.grid(True)
plt.xlabel('Iteration')
plt.ylabel('RMSE')
plt.title('Training Vs Validation Error')
plt.legend()
plt.show()
# Predict on the held-out test set and overlay actual vs predicted values.
ypredicted = xgbmodel.predict(X_test)
plt.figure(figsize=(10,3))
# Actual Vs Predicted (targets are z-score-normalized counts)
plt.plot(y_test, label='Actual')
plt.plot(ypredicted, label='Predicted')
plt.xlabel('Sample')
plt.ylabel('Count')
plt.xlim([100,150])
# This plot uses X_test/y_test, so title it as the test dataset
# (the previous title said 'Validation Dataset', which was incorrect).
plt.title('Test Dataset - Predicted Vs. Actual')
plt.legend()
plt.show()
# Over prediction and Under Prediction needs to be balanced
# (residuals should be roughly symmetric around zero)
# Test Data Residuals, in normalized units
residuals = (y_test - ypredicted) # * y_sd + y_mu (for actual cnt numbers)
plt.figure(figsize=(10,3))
plt.hist(residuals)
plt.grid(True)
plt.xlabel('Actual - Predicted')
plt.ylabel('Count')
plt.title('Residuals Distribution')
plt.axvline(color='r')
plt.show()
# Root-mean-squared error on the normalized test targets.
# mean_squared_error returns the MSE, so take the square root to report
# a true RMSE (the previous code printed the MSE under the RMSE label).
print("RMSE: {0:0.4f}".format(np.sqrt(mean_squared_error(y_test, ypredicted))))
RMSE: 0.0884
# Show the influencing features in order (with feature_importances_)
# Indicate which parameter used to decide the ranking of important features
important_features = pd.Series(data=xgbmodel.feature_importances_, index=NAMES)
important_features.sort_values(ascending=False,inplace=True)
print(important_features)
# There are different importance parameters:
# Refer to, https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/core.py
# * 'weight': the number of times a feature is used to split the data across all trees.
# * 'gain': the average gain across all splits the feature is used in.
# * 'cover': the average coverage across all splits the feature is used in.
# * 'total_gain': the total gain across all splits the feature is used in.
# * 'total_cover': the total coverage across all splits the feature is used in.
year_2012 0.334441 month 0.148238 year_2011 0.093973 atemp 0.091646 winter 0.060549 fall 0.057606 clear 0.034687 cloudy 0.033488 temp 0.031691 lightsnow.rain 0.028922 hum 0.022926 windspeed 0.017095 workingday_0 0.010901 holiday_0 0.009047 day 0.007691 dayofweek 0.007369 workingday_1 0.007166 spring 0.002563 holiday_1 0.000000 summer 0.000000 dtype: float32
# Rank features by average split gain taken from the booster itself.
# get_score() omits features never used in any split (e.g. 'summer',
# 'holiday_1'), which previously left NaN entries in the Series; treat
# an unused feature as having zero gain instead.
important_features = pd.Series(data=xgbmodel.get_booster().get_score(importance_type='gain'), index=NAMES).fillna(0.0)
important_features.sort_values(ascending=False,inplace=True)
print(important_features)
year_2012 24.965618 month 11.065815 year_2011 7.015012 atemp 6.841238 winter 4.519948 fall 4.300218 clear 2.589336 cloudy 2.499860 temp 2.365730 lightsnow.rain 2.159016 hum 1.711424 windspeed 1.276122 workingday_0 0.813746 holiday_0 0.675331 day 0.574106 dayofweek 0.550102 workingday_1 0.534939 spring 0.191319 summer NaN holiday_1 NaN dtype: float64
# Show the influencing features in order (with get_booster().get_fscore())
# fscore is based on the number of times a feature is used in a tree
fscore = xgbmodel.get_booster().get_fscore().items()
lists = sorted(fscore, key=lambda x:x[1]) # sorted by value, return a list of tuples
fkey, fvalue = zip(*lists) # unpack a list of pairs into two tuples
plt.barh(fkey, fvalue)
plt.show()
# Select days from the test set with low feeling-temperature and high
# humidity; thresholds are in standardized units (< -1 sd and > +1 sd).
# Row 301 (target cnt = 627) is one such day.
test = X_test[ (X_test['atemp'] < -1) & (X_test['hum'] > 1)]
print(test.shape)
test.head(5)
(2, 20)
| winter | spring | summer | fall | holiday_0 | holiday_1 | workingday_0 | workingday_1 | clear | cloudy | lightsnow.rain | year_2011 | year_2012 | temp | atemp | hum | windspeed | month | day | dayofweek | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 35 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | -1.432558 | -1.420304 | 2.116697 | -0.379718 | -1.310268 | -1.219765 | 0.99693 |
| 301 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | -1.318665 | -1.513304 | 1.788822 | 2.077411 | 1.008875 | 1.506293 | 0.99693 |
# Chosen low-atemp / high-humidity day for closer inspection
PICK = 301 #atempLow/humHigher[301]:cnt627
Xdf[Xdf.index==PICK]
| winter | spring | summer | fall | holiday_0 | holiday_1 | workingday_0 | workingday_1 | clear | cloudy | lightsnow.rain | year_2011 | year_2012 | temp | atemp | hum | windspeed | month | day | dayofweek | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 301 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | -1.318665 | -1.513304 | 1.788822 | 2.077411 | 1.008875 | 1.506293 | 0.99693 |
# review the target values, y[301] has target cnt = 627 which is a very low bike rental count
print("target count is:", y[301])
target count is: 627
# For comparison, keep the indices that were used for SHAP cases
SAMPLE1 = 364 #winter, weekend day (from test)
SAMPLE2 = 180 #summer, working day (from test)
# Select one day for investigation with a force plot later
weekEnd_sample = Xdf.loc[[SAMPLE1]] #winter, weekend day (from test)
weekDay_sample = Xdf.loc[[SAMPLE2]] #summer, working day (from test)
# Same rows in original (denormalized) units, for readable display
weekEnd_sample_origin = X_original.loc[[SAMPLE1]]
weekDay_sample_origin = X_original.loc[[SAMPLE2]]
weekEnd_sample
| winter | spring | summer | fall | holiday_0 | holiday_1 | workingday_0 | workingday_1 | clear | cloudy | lightsnow.rain | year_2011 | year_2012 | temp | atemp | hum | windspeed | month | day | dayofweek | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 364 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.466773 | -0.369869 | -0.084739 | 0.383083 | 1.58866 | 1.733465 | 0.99693 |
# Preview the human-readable feature matrix
X_original.head(5)
| winter | spring | summer | fall | holiday_0 | holiday_1 | workingday_0 | workingday_1 | clear | cloudy | lightsnow.rain | year_2011 | year_2012 | temp | atemp | hum | windspeed | month | day | dayofweek | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 14.110847 | 18.18125 | 80.5833 | 10.749882 | 1.0 | 1.0 | 5.0 |
| 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 14.902598 | 17.68695 | 69.6087 | 16.652113 | 1.0 | 2.0 | 6.0 |
| 2 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 8.050924 | 9.47025 | 43.7273 | 16.636703 | 1.0 | 3.0 | 0.0 |
| 3 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 8.200000 | 10.60610 | 59.0435 | 10.739832 | 1.0 | 4.0 | 1.0 |
| 4 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 9.305237 | 11.46350 | 43.6957 | 12.522300 | 1.0 | 5.0 | 2.0 |
# Tell the LIME explainer which columns are categorical: the first 13
# columns of Xdf are the one-hot encoded indicators. LIME also wants the
# category labels per column, obtained via a fitted LabelEncoder.
categorical_features = list(range(13))
categorical_names = {
    idx: LabelEncoder().fit(Xdf.iloc[:, idx]).classes_
    for idx in categorical_features
}
# Initialize the explainer on the training data
# Note that this is a regression problem, not a classification
explainer = LimeTabularExplainer(X_train.values, feature_names=NAMES,
                                 categorical_features=categorical_features,
                                 categorical_names=categorical_names,
                                 verbose=True, mode='regression')
# Actual rental counts of the two days we will explain
print("winter, weekend day target count:", y[SAMPLE1])
print("summer, working day target count:", y[SAMPLE2])
winter, weekend day target count: 2485 summer, working day target count: 5515
# Distribution of daily rental counts, plus its summary statistics
plt.figure(figsize=(4,2))
ax = sb.histplot(data=df, x="cnt", bins=100)
pd.DataFrame((df['cnt']).describe())
| cnt | |
|---|---|
| count | 731.000000 |
| mean | 4504.348837 |
| std | 1937.211452 |
| min | 22.000000 |
| 25% | 3152.000000 |
| 50% | 4548.000000 |
| 75% | 5956.000000 |
| max | 8714.000000 |
# The weekend sample in original (denormalized) units
weekEnd_sample_origin
| winter | spring | summer | fall | holiday_0 | holiday_1 | workingday_0 | workingday_1 | clear | cloudy | lightsnow.rain | year_2011 | year_2012 | temp | atemp | hum | windspeed | month | day | dayofweek | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 364 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 16.81 | 20.70605 | 61.5833 | 14.750318 | 12.0 | 31.0 | 5.0 |
# Explain one day: Weekend
# winter, weekend day target count: 2485
print("WEEKEND in Winter")
# The Explanation object has information about feature contributions to this particular prediction
exp_WE = explainer.explain_instance(weekEnd_sample.values[0], xgbmodel.predict, num_features=10)
exp_WE.show_in_notebook(show_all=False)
WEEKEND in Winter Intercept -0.1371735805936662 Prediction_local [-0.57309594] Right: -0.6958025
Based on the local model, the prediction value showed -0.573 while the actual model predicted -0.696, which are closely aligned and show a negative rent prediction on this day.
The renting prediction is negatively affected by the year 2011 feature. Being winter and having a lower feeling temperature affect the rent prediction negatively.
Having no light snow or rain, low humidity, and clear skies raise the rent prediction, but not enough to push the scales toward positive.
# Same winter weekend day (target count: 2485), but restricting the local
# model to the top-5 features; contributions are re-fit on fewer features.
print("WEEKEND in Winter (restrict to 5 features)")
exp_WE = explainer.explain_instance(
    weekEnd_sample.values[0], xgbmodel.predict, num_features=5
)
exp_WE.show_in_notebook(show_all=False)
WEEKEND in Winter (restrict to 5 features) Intercept 0.26500944484935074 Prediction_local [-0.52061615] Right: -0.6958025
# Tabulate each selected feature with its signed contribution in the
# local surrogate model.
pairs = exp_WE.as_list()
pd.DataFrame(pairs, columns=['Feature', 'Contribution'])
| Feature | Contribution | |
|---|---|---|
| 0 | year_2011=1.0 | -0.520235 |
| 1 | year_2012=0.0 | -0.421981 |
| 2 | winter=1.0 | -0.212996 |
| 3 | -0.73 < hum <= 0.04 | 0.195789 |
| 4 | clear=1.0 | 0.173798 |
# same with LIME plot
#with plt.style.context("ggplot"):
# exp_WE.as_pyplot_figure()
# Switch to a different instance: a summer working day (target count: 5515).
# num_features=25 asks for every feature's contribution.
print("WEEKDAY in Summer")
exp_wd = explainer.explain_instance(
    weekDay_sample.values[0], xgbmodel.predict, num_features=25
)
exp_wd.show_in_notebook(show_all=False)
WEEKDAY in Summer Intercept -0.3492401488179606 Prediction_local [0.09830227] Right: 0.22199456
# Re-explain the same summer working day (target count: 5515) with only
# 5 features — note the LIME local prediction changes with fewer features.
print("WEEKDAY in Summer (restrict to 5 features)")
exp_wd = explainer.explain_instance(
    weekDay_sample.values[0], xgbmodel.predict, num_features=5
)
exp_wd.show_in_notebook(show_all=False)
WEEKDAY in Summer (restrict to 5 features) Intercept 0.1690683484924712 Prediction_local [-0.20076617] Right: 0.22199456
pd.DataFrame(exp_wd.as_list(),columns=['Feature','Contribution'])
| Feature | Contribution | |
|---|---|---|
| 0 | year_2011=1.0 | -0.524257 |
| 1 | year_2012=0.0 | -0.435940 |
| 2 | hum <= -0.73 | 0.214068 |
| 3 | atemp > 0.82 | 0.191429 |
| 4 | clear=1.0 | 0.184865 |
# Reconstruct LIME's local prediction by hand:
# intercept of the surrogate model + sum of all feature contributions.
weights = pd.DataFrame(exp_wd.as_list())
exp_wd.intercept[1] + weights[1].sum()
-0.20076616629950356
# Pick four days spanning the atemp/hum quadrants (row index -> true cnt).
aL_hH = Xdf.loc[[301]]  # atemp low,  hum high    -> cnt 627
aL_hL = Xdf.loc[[368]]  # atemp low,  hum low     -> cnt 2368
aH_hL = Xdf.loc[[551]]  # atemp high, hum low     -> cnt 6241
aH_hA = Xdf.loc[[584]]  # atemp high, hum average -> cnt 7273
# The same rows in original (un-scaled) units, for readability.
aL_hH_origin = X_original.loc[[301]]
aL_hL_origin = X_original.loc[[368]]
aH_hL_origin = X_original.loc[[551]]
aH_hA_origin = X_original.loc[[584]]
aH_hA_origin
| winter | spring | summer | fall | holiday_0 | holiday_1 | workingday_0 | workingday_1 | clear | cloudy | lightsnow.rain | year_2011 | year_2012 | temp | atemp | hum | windspeed | month | day | dayofweek | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 584 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 30.169153 | 34.88105 | 70.375 | 7.832836 | 8.0 | 7.0 | 1.0 |
def _explain_day(title, sample):
    # Print a heading, explain one instance (top-5 features), render it,
    # and return the Explanation object for later inspection.
    print(title)
    exp = explainer.explain_instance(sample.values[0], xgbmodel.predict, num_features=5)
    exp.show_in_notebook(show_all=False)
    return exp

exp_aL_hH = _explain_day("atemp Lower/ hum Higher", aL_hH)    # 1: row 301
exp_aL_hL = _explain_day("atemp Lower/ hum Lower", aL_hL)     # 2: row 368
exp_aH_hL = _explain_day("atemp Higher/ hum Lower", aH_hL)    # 3: row 551
exp_aH_hA = _explain_day("atemp Higher/ hum Average", aH_hA)  # 4: row 584
atemp Lower/ hum Higher Intercept 0.6834259351334521 Prediction_local [-1.49897929] Right: -1.6550077
atemp Lower/ hum Lower Intercept -0.2691661773258217 Prediction_local [-0.46064575] Right: -0.8665453
atemp Higher/ hum Lower Intercept -0.8553477926513698 Prediction_local [0.81260366] Right: 0.89202654
atemp Higher/ hum Average Intercept -0.8617095307885089 Prediction_local [0.79119369] Right: 1.1825035
# Compare feature statistics in the normalized inputs vs. original units.
# 'f' formatting keeps plain decimal notation (suppresses scientific format).
df_hat_n = pd.DataFrame({c: Xdf[c].describe().apply('{:f}'.format)
                         for c in ('hum', 'atemp', 'temp')})
# Undo the dataset's 0-1 scaling: hum was /100, atemp /50, temp /41.
df_hat = pd.DataFrame({c: (df[c] * k).describe()
                       for c, k in (('hum', 100), ('atemp', 50), ('temp', 41))})
display(df_hat)
display(df_hat_n)
| hum | atemp | temp | |
|---|---|---|---|
| count | 731.000000 | 731.000000 | 731.000000 |
| mean | 62.789406 | 23.717699 | 20.310776 |
| std | 14.242910 | 8.148059 | 7.505091 |
| min | 0.000000 | 3.953480 | 2.424346 |
| 25% | 52.000000 | 16.892125 | 13.820424 |
| 50% | 62.666700 | 24.336650 | 20.431653 |
| 75% | 73.020850 | 30.430100 | 26.872076 |
| max | 97.250000 | 42.044800 | 35.328347 |
| hum | atemp | temp | |
|---|---|---|---|
| count | 731.000000 | 731.000000 | 731.000000 |
| mean | -0.000000 | 0.000000 | -0.000000 |
| std | 1.000685 | 1.000685 | 1.000685 |
| min | -4.411486 | -2.427296 | -2.384871 |
| 25% | -0.758047 | -0.838267 | -0.865385 |
| 50% | -0.008621 | 0.076015 | 0.016117 |
| 75% | 0.718845 | 0.824368 | 0.874845 |
| max | 2.421148 | 2.250800 | 2.002355 |